In this document I’ll be downloading and processing the scRNA-seq data produced by Neftel et al in their 2019 Cell study, in which they use scRNA-seq to examine 28 glioblastoma tumour samples using both Smart-seq2 and 10x.
The document will be split into the following sections:
The source R Markdown document is available here:
Download Neftel_2019_GBM_scRNAseq.RmdThis document was created to gain familiarity with glioblastoma scRNA-seq datasets and scRNA-seq data analysis in general. All of the data and software used in this analysis are freely available - links to software and datasets are provided in the relevant sections.
In additon to caching code chunks, throughout the document you’ll notice statements along the lines of if(!file.exists(here("data/large_file.tsv"))){...}. I included these to avoid re-downloading large files and as a compromise to prevent unreasonably long running times when knitting the R Markdown document. These are generally included in code chunks written to produce intermediate files.
Start by loading any R packages that will be required throughout the analysis.
suppressMessages(library(tidyverse))
suppressMessages(library(GEOquery))
suppressMessages(library(readxl))
suppressMessages(library(Seurat))
suppressMessages(library(RColorBrewer))
suppressMessages(library(plotly))
suppressMessages(library(scales))
suppressMessages(library(glue))
The data for this project was uploaded to NCBI GEO and is available at accession GSE131928.
# Download GSE131928
if(!file.exists("data/Neftel_2019/GSE131928_series_matrix.txt.gz")){
dir.create("data/Neftel_2019")
gse <- getGEO(GEO = "GSE131928", destdir = "data/Neftel_2019", GSEMatrix = TRUE)
}
gse <- getGEO(filename = "data/Neftel_2019/GSE131928_series_matrix.txt.gz", GSEMatrix = TRUE)
## Parsed with column specification:
## cols(
## ID_REF = col_character(),
## GSM3828672 = col_character(),
## GSM3828673 = col_character()
## )
## File stored at:
## /tmp/RtmpUqvWnN/GPL18573.soft
# Take a quick look
head(pData(gse))
# Get GSE131928_RAW.tar if not already done
if(!file.exists("data/Neftel_2019/GSE131928_RAW.tar")){
getGEOSuppFiles(GEO="GSE131928", baseDir = "data/Neftel_2019", makeDirectory = FALSE, fetch_files = TRUE)
untar("data/Neftel_2019/GSE131928_RAW.tar", exdir="data/Neftel_2019/")
}
# Check to see the files we have in the data directory
list.files("data/Neftel_2019/")
## [1] "GPL18573.soft"
## [2] "GSE131928_RAW.tar"
## [3] "GSE131928_series_matrix.txt.gz"
## [4] "GSE131928_single_cells_tumor_name_and_adult_or_peidatric.xlsx"
## [5] "GSM3828672_Smartseq2_GBM_IDHwt_processed_TPM.tsv.gz"
## [6] "GSM3828673_10X_GBM_IDHwt_processed_TPM.tsv.gz"
## [7] "MGH143_CC_alt.rds"
## [8] "MGH143_CC.rds"
## [9] "MGH143_SCT_CC.rds"
## [10] "MGH143_SCT_noImmune.rds"
## [11] "MGH143_SCT.rds"
## [12] "MGH143_std_CC.rds"
## [13] "smartseq.integrated_ModScore.rds"
## [14] "smartseq.rds"
## [15] "SmartSeq2_integrated.rds"
## [16] "tenx_integrated_ModScore_noImmune.rds"
## [17] "tenx_integrated_ModScore.rds"
## [18] "tenx_integrated.rds"
## [19] "tenx.rds"
## [20] "test.tsv"
## [21] "three_samples_SCT.rds"
# import metadata
metadata <-read_excel(path="data/Neftel_2019/GSE131928_single_cells_tumor_name_and_adult_or_peidatric.xlsx", skip = 43)
# Import smartseq2 counts and save out as a Seurat object
if(!file.exists("data/Neftel_2019/smartseq.rds")){
smartseq <- CreateSeuratObject(counts=read.delim("data/Neftel_2019/GSM3828672_Smartseq2_GBM_IDHwt_processed_TPM.tsv.gz", sep = "\t", check.names = FALSE, row.names = 1), project = "Neftel_2019", min.cells = 0, min.features = 0)
saveRDS(smartseq, "data/Neftel_2019/smartseq.rds")
}
# Import 10x counts and save out as a Seurat object
if(!file.exists("data/Neftel_2019/tenx.rds")){
tenx <- CreateSeuratObject(counts=read.delim("data/Neftel_2019/GSM3828673_10X_GBM_IDHwt_processed_TPM.tsv.gz", sep = "\t", check.names = FALSE, row.names = 1), project = "Neftel_2019_10x", min.cells = 3, min.features = 200)
saveRDS(tenx, "data/Neftel_2019/tenx.rds")
}
tenx <- readRDS("data/Neftel_2019/tenx.rds")
Have a quick look at what the object contains.
# Note: The file name "GSM3828673_10X_GBM_IDHwt_processed_TPM.tsv.gz" suggests the expression values are in TPM, but they shouldn't be because it's 10x count data.
head(tenx[[]])
# We can see that the nCount_RNA (colSum of the expression counts) is approximately but not quite 1 million - for TPM it should be 1 million. If we look at this across all of the cells:
plot(unlist((tenx[["nCount_RNA"]])), col=Idents(tenx))
# Some cells have much lower values. THere is some variation among samples:
plot(unlist((tenx[["nCount_RNA"]])), ylim=c(970000, 1E6), col=Idents(tenx))
# For some reason there is variation wihin cells for the second (red) sample. I don't know why these aren't just raw counts
# Object dimensions
dim(tenx)
## [1] 22178 15112
# Cell identities
table(Idents(tenx))
##
## 102 105 105A 114 115 118 124 125 126 143
## 1822 3038 1386 473 1283 539 2415 1613 229 2314
# What are the cell names?
head(colnames(tenx))
## [1] "102_1" "102_2" "102_4" "102_5" "102_7" "102_8"
# "102" is a patient sample. How many patient samples?
table(str_split_fixed(colnames(tenx),"_",2)[,1])
##
## 102 105 105A 114 115 118 124 125 126 143
## 1822 3038 1386 473 1283 539 2415 1613 229 2314
# The number of cells/pt sample varies quite a bit.
# How many cells total?
length(colnames(tenx))
## [1] 15112
# What do the gene names look like?
head(rownames(tenx))
## [1] "RP11-34P13.7" "AP006222.2" "RP4-669L17.10" "RP5-857K21.4"
## [5] "RP11-206L10.9" "FAM87B"
# How many genes?
length(rownames(tenx))
## [1] 22178
# Are these adult or pediatric samples?
metadata %>% filter(!str_detect(metadata$`processed data file`, "Smartseq2")) %>% group_by(`tumour name`, `adult/pediatric`) %>% count %>% select(-n)
Now we can proceed through the standard pre-processing workflow for scRNA-seq data in Seurat. These steps are; - filtration of cells based on QC metrics - data normalization and scaling - the detection of highly variable features
Visualise QC metrics, and use these to filter cells.
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
tenx[["percent.mt"]] <- PercentageFeatureSet(object = tenx, pattern = "^MT-")
# Show QC metrics for the first 5 cells
head(x = tenx@meta.data, 5)
#Visualize QC metrics as a violin plot
VlnPlot(object = tenx, features = c("nFeature_RNA", "percent.mt"), ncol = 2)
There are a few points that jump out from this plot:
# Remove cells with > 20% mitochondrial reads, cells from samples 105 and 126
tenx <- subset(x = tenx, subset = percent.mt < 20 & orig.ident != 105 & orig.ident != 126)
dim(tenx)
## [1] 22178 11830
For now we’ll proceed with just one sample: MGH143, which had the most reads.
MGH143 <- subset(x = tenx, subset = orig.ident == 143)
We will use the new SCTransform function in place of the standard NormalizeData, FindVariableFeatures, ScaleData workflow from Seurat v2.
## SCTransform
# run sctransform if not already done
if(!file.exists("data/Neftel_2019/MGH143_SCT.rds")){
MGH143 <- SCTransform(MGH143, verbose = TRUE)
saveRDS(MGH143_SCT, "data/Neftel_2019/MGH143_SCT.rds")
} else {
MGH143 <- readRDS("data/Neftel_2019/MGH143_SCT.rds")
}
# These are now standard steps in the Seurat workflow for visualization and clustering
MGH143 <- RunPCA(MGH143, verbose = TRUE)
## PC_ 1
## Positive: UBE2C, KIAA0101, CKS1B, CENPK, CENPF, BIRC5, UBE2T, RNASEH2A, PTTG1, HMGB2
## NUSAP1, H2AFX, TYMS, CENPU, TPX2, SMC4, ZWINT, CCDC34, FAM64A, SMC2
## TOP2A, PBK, PRC1, CDKN3, GGH, MAD2L1, CKS2, WDR34, GTSE1, CDK1
## Negative: NUPR1, APOE, FCGRT, SOD2, LIPA, BEX4, LY96, SYNGR2, SLC16A3, CD68
## RNF13, TYMP, CORO1B, TPP1, PYCARD, FTL, RNASET2, ABCA1, HERPUD1, FUCA1
## CREG1, PLAUR, CD74, FTH1, BST2, MFSD1, UNC93B1, THEMIS2, RASSF4, SELM
## PC_ 2
## Positive: PCDH9, RGMA, FAM181B, LINC00461, NPAS3, F3, CHL1, EDNRB, TM7SF2, CA2
## CITED1, C21orf62, KLHDC8A, RND2, BICD1, LINC01003, SCRG1, CRYAB, TRIM9, NREP
## DCLK2, LDLRAD3, DOK5, AQP4, DPF3, ZFHX4, CTXN1, DCLK1, ITGB8, MAPT
## Negative: CDKN1A, HMGB2, GSTO1, TACC3, REEP4, COTL1, CENPF, UBE2C, CKS2, NUDT1
## TPX2, CENPW, FAM64A, CENPU, FXYD5, BIRC5, TOP2A, UBE2T, CAPG, PBK
## MAD2L1, SGOL2, KIAA0101, ZWINT, MKI67, NUSAP1, CCNA2, PRC1, CDKN3, MOB1A
## PC_ 3
## Positive: AKAP12, HMGA2, DCBLD2, PDP1, LARP6, SLC20A1, EMP1, BHLHE41, SERINC2, RND3
## STEAP3, RGS17, ITGA3, ERRFI1, POPDC3, CHRNA9, SHC1, SLC4A7, CAPN2, RDH10
## MEG3, TMEM158, ACOT7, SOD3, NRCAM, RP3-428L16.2, IGFBP5, SPHK1, NT5E, FHL2
## Negative: S100B, FAM181B, EDNRB, ID3, A2M, F3, SCRG1, BCAN, RGMA, DCLK2
## AQP4, CHL1, LMO2, PRCP, CITED1, KLHDC8A, IL1RAP, TTYH3, SPARCL1, ELOVL2
## IDH1, MLC1, COL9A3, NFIA, GNB4, PSRC1, ROBO2, CD9, PLSCR1, DPYSL5
## PC_ 4
## Positive: GAD1, GAS1, WSCD1, ASCL1, PROX1, FAM181B, HES6, DSEL, CHD7, CDKN2C
## TXNIP, BTG2, NREP, DNAJB2, SOX21, SOX11, MEX3A, RGS16, ETV5, CCNG2
## OLIG1, SEZ6L, DCX, IDH1, PTPRS, OLIG2, LDLRAD3, C2orf80, BEX1, RNF180
## Negative: CCL2, HES4, CA2, FAM129A, C1R, LMO2, NAMPT, TRIM47, CEBPD, AQP4
## CRYAB, NNMT, C21orf62, C1S, EHBP1, EFEMP1, SCRG1, SPARCL1, IL1RAP, DCLK1
## THY1, FERMT2, VCAM1, LPL, LMCD1, DEGS1, CAMK1, MSMO1, SFXN5, LINC00152
## PC_ 5
## Positive: BRIX1, DCAF13, PCNA, RAE1, EEF1E1, PSMD6, CTNNBL1, GNL3, MCM7, IFRD1
## GTPBP4, NASP, TRAP1, SLBP, SRP54, STMN4, NOP58, FRG1, PSAT1, AIMP2
## RPF2, IER2, ACTL6A, GNL2, ARL6IP6, HAT1, MCM3, UTP18, BCAS2, PHGDH
## Negative: FZD7, HS6ST1, LGR4, CAMK1, NRCAM, SHC1, PLXNA1, DHCR24, ABCA1, APOE
## MCHR1, ZNF436, MIR210HG, CHST2, SESN3, C21orf62, NPAS3, GALNT2, FAM20C, BHLHE41
## PTK2B, MDGA1, EFNB2, CD81, IFI27L2, NPTXR, ENDOD1, STK39, HS6ST2, RAMP1
MGH143 <- RunUMAP(MGH143, dims = 1:30, n.components = 2, verbose = TRUE)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:21:04 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:21:04 Read 2309 rows and found 30 numeric columns
## 10:21:04 Using Annoy for neighbor search, n_neighbors = 30
## 10:21:04 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:21:04 Writing NN index file to temp file /tmp/RtmpUqvWnN/file5e036400ac9c
## 10:21:04 Searching Annoy index using 1 thread, search_k = 3000
## 10:21:05 Annoy recall = 100%
## 10:21:05 Commencing smooth kNN distance calibration using 1 thread
## 10:21:06 Initializing from normalized Laplacian + noise
## 10:21:06 Commencing optimization for 500 epochs, with 92534 positive edges
## 10:21:12 Optimization finished
MGH143 <- FindNeighbors(MGH143, dims = 1:30, verbose = TRUE)
## Computing nearest neighbor graph
## Computing SNN
MGH143 <- FindClusters(MGH143, verbose = TRUE)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2309
## Number of edges: 81661
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7882
## Number of communities: 11
## Elapsed time: 0 seconds
DimPlot(MGH143, label = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
### 3D UMAP plot Quick aside: we can plot in 3D as well, by requesting UMAP to embed in 3 dimensions:
# Embed in 3 dimensions
MGH143_3D <- RunUMAP(MGH143, dims = 1:30, n.components = 3, verbose = TRUE)
## 10:21:14 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:21:14 Read 2309 rows and found 30 numeric columns
## 10:21:14 Using Annoy for neighbor search, n_neighbors = 30
## 10:21:14 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:21:14 Writing NN index file to temp file /tmp/RtmpUqvWnN/file5e0365c84bdc8
## 10:21:14 Searching Annoy index using 1 thread, search_k = 3000
## 10:21:15 Annoy recall = 100%
## 10:21:15 Commencing smooth kNN distance calibration using 1 thread
## 10:21:15 Initializing from normalized Laplacian + noise
## 10:21:15 Commencing optimization for 500 epochs, with 92534 positive edges
## 10:21:22 Optimization finished
# Plot
my_df <- Embeddings(MGH143_3D, reduction = "umap") %>%
as_tibble() %>%
mutate(cluster = MGH143_3D$seurat_clusters)
p <- plot_ly(my_df, x = ~UMAP_1, y = ~UMAP_2, z = ~UMAP_3,
color = ~cluster, colors = hue_pal()(length(levels(MGH143_3D$seurat_clusters))),
size=1) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'UMAP_1'),
yaxis = list(title = 'UMAP_2'),
zaxis = list(title = 'UMAP_3')))
p
# Clean up
rm(MGH143_3D, my_df, p)
Quick sanity check.
# EGFR and CD45
FeaturePlot(object = MGH143, features = c("EGFR","PTPRC"), reduction= "umap", label = TRUE)
Like we saw with the Darmanis et al 2017 dataset, the immune cell population forms a distinct cluster. Remove these for now, as we’re interested in looking at the expression states within the neoplastic cells. Note: following some downstream analysis, I’m not sure that cluster 10 is neoplastic cells either, we will remove cluster 10 for now.
if(!file.exists("data/Neftel_2019/MGH143_SCT_noImmune.rds")){
MGH143 <- subset(x = MGH143, subset = seurat_clusters != 7 & seurat_clusters != 10)
MGH143 <- SCTransform(MGH143, verbose = TRUE)
MGH143 <- RunPCA(MGH143, verbose = TRUE)
MGH143 <- RunUMAP(MGH143, dims = 1:30, n.components = 2, verbose = TRUE)
MGH143 <- FindNeighbors(MGH143, dims = 1:30, verbose = TRUE)
MGH143 <- FindClusters(MGH143, verbose = TRUE)
saveRDS(MGH143, "data/Neftel_2019/MGH143_SCT_noImmune.rds")
} else {
MGH143 <- readRDS("data/Neftel_2019/MGH143_SCT_noImmune.rds")
}
Check association with cell cycle.
# Perform cell cycle scoring
MGH143 <- CellCycleScoring(MGH143, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
## Warning: The following features are not present in the object: UHRF1, MLF1IP,
## CASP8AP2, not searching for symbol synonyms
FeaturePlot(object = MGH143, features = c("S.Score", "G2M.Score"), reduction= "umap", blend = TRUE)
Cell cycle has a large influence, but Neftel et al identified OPC-like and NPC-like expression states contain a higher proportion of cycling cells. If we regress out the effect of cell cycle we might also remove some of the biological signal associated with these expression states. We will leave it as is for now.
Let’s try to map some of the cells to different expression states
# Import modules
Neftel_2019_states <- read_delim("../marker_genes/data/Nefel_2019_states.tsv", "\t", escape_double = FALSE, trim_ws = TRUE)
## Parsed with column specification:
## cols(
## MES2 = col_character(),
## MES1 = col_character(),
## AC = col_character(),
## OPC = col_character(),
## NPC1 = col_character(),
## NPC2 = col_character(),
## `G1/S` = col_character(),
## `G2/M` = col_character()
## )
classical <- read_delim("../marker_genes/data/Wang_2017_classical.tsv","\t", escape_double = FALSE, trim_ws = TRUE) %>% select(GeneSymbol) %>% rename(classical=GeneSymbol)
## Parsed with column specification:
## cols(
## GeneSymbol = col_character(),
## avg_expr = col_double(),
## `ttest_p-val` = col_double()
## )
proneural <- read_delim("../marker_genes/data/Wang_2017_proneural.tsv","\t", escape_double = FALSE, trim_ws = TRUE) %>% select(GeneSymbol) %>% rename(proneural=GeneSymbol)
## Parsed with column specification:
## cols(
## GeneSymbol = col_character(),
## avg_expr = col_double(),
## `ttest_p-val` = col_double()
## )
mesenchymal <- read_delim("../marker_genes/data/Wang_2017_mesenchymal.tsv","\t", escape_double = FALSE, trim_ws = TRUE) %>% select(GeneSymbol) %>% rename(mesenchymal=GeneSymbol)
## Parsed with column specification:
## cols(
## GeneSymbol = col_character(),
## avg_expr = col_double(),
## `ttest_p-val` = col_double()
## )
# Add module scores
MGH143 <- AddModuleScore(object = MGH143, features = Neftel_2019_states, name = colnames(Neftel_2019_states))
## Warning: The following features are not present in the object: ERO1L, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: SERPINA3, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: PPAP2B, NA, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: GPR17, OMG, SOX2-
## OT, LPPR1, CA10, not searching for symbol synonyms
## Warning: The following features are not present in the object: TUBB3, CD24,
## NPPA, GPR56, not searching for symbol synonyms
## Warning: The following features are not present in the object: CD24, HMP19,
## TUBB3, DLX6-AS1, DLX5, IGFBPL1, LOC150568, not searching for symbol synonyms
## Warning: The following features are not present in the object: MLF1IP, NA, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: NA, not searching
## for symbol synonyms
MGH143 <- AddModuleScore(object = MGH143, features = classical, name = colnames(classical))
## Warning: The following features are not present in the object: KIAA0494, not
## searching for symbol synonyms
MGH143 <- AddModuleScore(object = MGH143, features = proneural, name = colnames(proneural))
## Warning: The following features are not present in the object: GPR17, KLRC3,
## NPPA, CA10, PAK7, SOX10, ZNF643, JPH3, IL1RAPL1, VIPR2, SLC17A6, KLRK1, GABRB3,
## KLRC4, not searching for symbol synonyms
MGH143 <- AddModuleScore(object = MGH143, features = mesenchymal, name = colnames(mesenchymal))
## Warning: The following features are not present in the object: PTGS2, not
## searching for symbol synonyms
# MES-like 1
FeaturePlot(object = MGH143, features = c("MES12"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
# MES-like 2
FeaturePlot(object = MGH143, features = c("MES21"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
# AC-like
FeaturePlot(object = MGH143, features = c("AC3"), reduction= "umap", cols = brewer.pal(9, "YlOrBr"), label = TRUE)
# OPC-like
FeaturePlot(object = MGH143, features = c("OPC4"), reduction= "umap", cols = brewer.pal(9, "Greens"), label = TRUE)
# NPC-like 1
FeaturePlot(object = MGH143, features = c("NPC15"), reduction= "umap", cols = brewer.pal(9, "Blues"), label = TRUE)
# NPC-like 2
FeaturePlot(object = MGH143, features = c("NPC26"), reduction= "umap", cols = brewer.pal(9, "Blues"), label = TRUE)
# Wang et al subtypes
FeaturePlot(object = MGH143, features = c("classical1"), reduction= "umap", cols = brewer.pal(9, "Blues"), label = TRUE)
FeaturePlot(object = MGH143, features = c("proneural1"), reduction= "umap", cols = brewer.pal(9, "Greens"), label = TRUE)
FeaturePlot(object = MGH143, features = c("mesenchymal1"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
Now that we’ve looked at one sample, let’s integrate the remaining samples and study all of the cells together. We’ll do this using Seurat v3’s integration capabilities.
# Integrate datasets if not already done
if(!file.exists("data/Neftel_2019/tenx_integrated.rds")){
# Create a list of Seurat objects to integrate
tenx.list <- SplitObject(tenx, split.by = "orig.ident")
# run SCTransform on each object separately
for (i in 1:length(tenx.list)) {
tenx.list[[i]] <- SCTransform(tenx.list[[i]], verbose = TRUE, return.only.var.genes=FALSE)
}
# select features for downstream integration
tenx.features <- SelectIntegrationFeatures(object.list = tenx.list, nfeatures = 3000)
# run PrepSCTIntegration, which ensures that all necessary Pearson residuals have been calculated
tenx.list <- PrepSCTIntegration(object.list = tenx.list, anchor.features = tenx.features, verbose = TRUE)
# identify anchors
tenx.anchors <- FindIntegrationAnchors(object.list = tenx.list, normalization.method = "SCT", anchor.features = tenx.features, verbose = FALSE)
# integrate the datasets
tenx.integrated <- IntegrateData(anchorset = tenx.anchors, normalization.method = "SCT", verbose = FALSE)
# proceed with downstream analysis (i.e. visualization, clustering) on the integrated dataset
tenx.integrated <- RunPCA(tenx.integrated, verbose = TRUE)
tenx.integrated <- RunUMAP(tenx.integrated, dims = 1:30)
saveRDS(tenx.integrated, "data/Neftel_2019/tenx_integrated.rds")
} else {
tenx.integrated <- readRDS("data/Neftel_2019/tenx_integrated.rds")
}
# Find clusters
tenx.integrated <- FindNeighbors(tenx.integrated, dims = 1:30, verbose = TRUE)
## Computing nearest neighbor graph
## Computing SNN
tenx.integrated <- FindClusters(tenx.integrated, verbose = TRUE)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11830
## Number of edges: 562090
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8568
## Number of communities: 17
## Elapsed time: 1 seconds
# Plot UMAP
DimPlot(tenx.integrated, label = TRUE)
DimPlot(tenx.integrated, group.by = c("orig.ident"))
DimPlot(tenx.integrated, split.by = c("orig.ident"), ncol = 2)
# Distinguish tumour from immune cell population
FeaturePlot(object = tenx.integrated, features = c("EGFR","PTPRC"), reduction= "umap", slot="scale.data")
# The cluster on the left appears to be the neoplastic cells, the cluster on the right appears to be the immune cells
# Add module scores and save out - if not already done
if(!file.exists("data/Neftel_2019/tenx_integrated_ModScore.rds")){
# The AddModuleScore function was working using the integrated data which contains 3000 rows, so most of the genes from the modules were missing, so I changed the active assay to SCT
DefaultAssay(object = tenx.integrated) <- "SCT"
tenx.integrated <- AddModuleScore(object = tenx.integrated, features = Neftel_2019_states, name = colnames(Neftel_2019_states))
tenx.integrated <- AddModuleScore(object = tenx.integrated, features = classical, name = colnames(classical))
tenx.integrated <- AddModuleScore(object = tenx.integrated, features = proneural, name = colnames(proneural))
tenx.integrated <- AddModuleScore(object = tenx.integrated, features = mesenchymal, name = colnames(mesenchymal))
tenx.integrated <- CellCycleScoring(tenx.integrated, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
saveRDS(tenx.integrated, "data/Neftel_2019/tenx_integrated_ModScore.rds")
} else {
tenx.integrated <- readRDS("data/Neftel_2019/tenx_integrated_ModScore.rds")
}
## Plot module scores
# Cluster 4 seems to capture the cycling neoplastic cells
FeaturePlot(object = tenx.integrated, features = c("S.Score", "G2M.Score"), reduction= "umap", label = TRUE)
# Neftel et al subtypes
FeaturePlot(object = tenx.integrated, features = c("MES12"), reduction= "umap", cols = brewer.pal(9, "Reds"))
FeaturePlot(object = tenx.integrated, features = c("MES21"), reduction= "umap", cols = brewer.pal(9, "Reds"))
FeaturePlot(object = tenx.integrated, features = c("AC3"), reduction= "umap", cols = brewer.pal(9, "YlOrBr"))
FeaturePlot(object = tenx.integrated, features = c("OPC4"), reduction= "umap", cols = brewer.pal(9, "Greens"))
FeaturePlot(object = tenx.integrated, features = c("NPC15"), reduction= "umap", cols = brewer.pal(9, "Blues"))
FeaturePlot(object = tenx.integrated, features = c("NPC26"), reduction= "umap", cols = brewer.pal(9, "Blues"))
# Wang et al subtypes
FeaturePlot(object = tenx.integrated, features = c("classical1"), reduction= "umap", cols = brewer.pal(9, "Blues"), label = TRUE)
FeaturePlot(object = tenx.integrated, features = c("proneural1"), reduction= "umap", cols = brewer.pal(9, "Greens"), label = TRUE)
FeaturePlot(object = tenx.integrated, features = c("mesenchymal1"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
# Remove clusters 14, 8, 15, 1, 5, 9 & 10
if(!file.exists("data/Neftel_2019/tenx_integrated_ModScore_noImmune.rds")){
tenx.integrated <- subset(x = tenx.integrated, subset = seurat_clusters != 14 & seurat_clusters != 8 & seurat_clusters != 15 & seurat_clusters != 1 & seurat_clusters != 5 & seurat_clusters != 9 & seurat_clusters != 10 & seurat_clusters != 7)
saveRDS(tenx.integrated, "data/Neftel_2019/tenx_integrated_ModScore_noImmune.rds")
} else {
tenx.integrated <- readRDS("data/Neftel_2019/tenx_integrated_ModScore_noImmune.rds")
}
DefaultAssay(object = tenx.integrated) <- "integrated"
tenx.integrated <- RunPCA(tenx.integrated, verbose = TRUE)
## PC_ 1
## Positive: SRGN, CD74, S100A11, LAPTM5, TYROBP, CYBA, ALOX5AP, FCER1G, HLA-DRB1, HLA-DRA
## HLA-DPA1, RNASET2, C1QB, C1QC, C1QA, AIF1, NPC2, CD68, ARHGDIB, HLA-DPB1
## HLA-E, VAMP8, FCGR3A, RGS1, CD14, APOC1, VSIG4, ADORA3, RGS10, CAPG
## Negative: TUBB2B, NOVA1, NGFRAP1, SOX4, PCSK1N, PTN, PFN2, MDK, FXYD6, GPM6B
## PRDX2, GPM6A, UCHL1, MLLT11, MARCKSL1, TSC22D1, RBP1, STMN4, MAP1B, SOX2
## YWHAQ, SRI, NFIB, MARCKS, BEX2, C7orf55, SCG3, YWHAE, PTMS, APLP1
## PC_ 2
## Positive: TERF2IP, MAP1B, TUBB2A, STMN4, ARL4C, MLLT11, DNAJB6, TCF4, TTC3, SOX4
## CITED2, BNIP3L, KLC1, APP, ATP6V1G1, RBFOX2, GAP43, EIF4A2, CAMK2N1, KMT2E
## ID2, NFIB, UCHL1, BEX2, TCEAL7, APLP1, CXXC5, MAGED2, CXADR, EIF4G2
## Negative: MT2A, BCAN, MT3, PON2, SCRG1, RAMP1, GATM, OLIG1, PHGDH, RPA3
## TMEM100, CD9, RGCC, GPM6B, HES6, TSC22D4, PSAT1, PTPRZ1, PTTG1, MT1M
## NTRK2, DTYMK, DCXR, POLE4, HMGB2, METRN, APOD, SPARCL1, RNASEH2A, SCD5
## PC_ 3
## Positive: HMGB2, CKS2, SNRPB, H2AFZ, MAD2L1, HMGB1, SMC4, HIST1H4C, PTTG1, DEK
## PCNA, NUSAP1, CENPF, DDX39A, NUDT1, UBE2S, KPNA2, HMGB3, MCM7, TMPO
## H2AFV, SRSF3, NASP, EZH2, RNASEH2B, H2AFX, LSM5, IER2, DTYMK, ARL6IP6
## Negative: CLU, PTGDS, S100B, APOD, S100A6, CPE, CRYAB, CBR1, PLP1, SCG5
## TSC22D4, TM7SF2, FIBIN, ALDOC, PLA2G16, SLC22A17, C1orf122, CD9, LYRM5, FAM127A
## NPDC1, TPPP3, PTN, FXYD6, SPARC, SLC44A1, MT1E, CD81, RGCC, SHISA4
## PC_ 4
## Positive: HES6, SCD5, CKB, BCAN, PRDX2, MT-ND3, NDUFA3, SLC25A5, RAMP1, TSC22D4
## HMGB1, GPM6B, OLIG1, S100B, LSM4, NME1, NDUFA11, SUB1, APOA1BP, MARCKS
## PTPRZ1, NOVA1, C1orf61, OLIG2, ATP5I, POLR2L, SCRG1, MARCKSL1, METRN, PFN2
## Negative: DDIT3, ENO2, BNIP3L, SARS, SQSTM1, HSPA9, MAFB, DARS, GAP43, S100A10
## BNIP3, PGK1, NRN1, GARS, EIF4A2, SUGT1, SCG2, CIR1, SLC3A2, ASNS
## NUPR1, ENO1, ATF4, TAF1D, RAB1A, ENOPH1, SYF2, GADD45A, TAGLN2, NDRG1
## PC_ 5
## Positive: MT2A, MT3, SPARCL1, MT1X, CEBPD, CD9, GFAP, MT1M, IGFBP2, PHGDH
## LGALS1, VIM, S100A10, C1orf61, SLC3A2, IGFBP5, GPM6B, NTRK2, VEGFA, LDHA
## CLU, PGK1, CD99, ENO1, PKM, HEY1, TMEM100, GADD45A, HMGB2, TIMP1
## Negative: C4orf48, MLLT11, GSTA4, AK1, ATAT1, KLC1, NFIB, MARCKS, TERF2IP, TSPAN13
## UCHL1, APLP1, COTL1, DPYSL3, ACOT7, PPP1R14B, C9orf16, ATP1B3, SH3BGRL3, CHD4
## TCF4, NPDC1, CCDC90B, BEX1, CXADR, MAP2, BEX2, CRMP1, PTPRN2, TXN
tenx.integrated <- RunUMAP(tenx.integrated, dims = 1:30)
## 10:22:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:22:55 Read 6625 rows and found 30 numeric columns
## 10:22:55 Using Annoy for neighbor search, n_neighbors = 30
## 10:22:55 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:22:56 Writing NN index file to temp file /tmp/RtmpUqvWnN/file5e0369b54616
## 10:22:56 Searching Annoy index using 1 thread, search_k = 3000
## 10:22:59 Annoy recall = 100%
## 10:22:59 Commencing smooth kNN distance calibration using 1 thread
## 10:23:00 Initializing from normalized Laplacian + noise
## 10:23:00 Commencing optimization for 500 epochs, with 315052 positive edges
## 10:23:20 Optimization finished
tenx.integrated <- FindNeighbors(tenx.integrated, dims = 1:30, verbose = TRUE)
## Computing nearest neighbor graph
## Computing SNN
tenx.integrated <- FindClusters(tenx.integrated, verbose = TRUE)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6625
## Number of edges: 333787
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8071
## Number of communities: 11
## Elapsed time: 0 seconds
# Plot UMAP
DimPlot(tenx.integrated, label = TRUE)
DimPlot(tenx.integrated, group.by = c("orig.ident"))
DimPlot(tenx.integrated, split.by = c("orig.ident"), ncol = 2)
## Plot module scores
# Cluster 4 seems to capture the cycling neoplastic cells
FeaturePlot(object = tenx.integrated, features = c("S.Score", "G2M.Score"), reduction= "umap", label = TRUE)
# Neftel et al subtypes
FeaturePlot(object = tenx.integrated, features = c("OPC4"), reduction= "umap", cols = brewer.pal(9, "YlGn"))
FeaturePlot(object = tenx.integrated, features = c("AC3"), reduction= "umap", cols = brewer.pal(9, "YlOrBr"))
FeaturePlot(object = tenx.integrated, features = c("MES12"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = tenx.integrated, features = c("MES21"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = tenx.integrated, features = c("NPC26"), reduction= "umap", cols = brewer.pal(9, "PuBu"))
FeaturePlot(object = tenx.integrated, features = c("NPC15"), reduction= "umap", cols = brewer.pal(9, "PuBu"))
# Wang et al subtypes
FeaturePlot(object = tenx.integrated, features = c("classical1"), reduction= "umap", cols = brewer.pal(9, "PuBu"), label = TRUE)
FeaturePlot(object = tenx.integrated, features = c("proneural1"), reduction= "umap", cols = brewer.pal(9, "Greens"), label = TRUE)
FeaturePlot(object = tenx.integrated, features = c("mesenchymal1"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
smartseq <- readRDS("data/Neftel_2019/smartseq.rds")
Have a quick look at what the object contains.
# Have a quick look at some of the data
smartseq@assays$RNA[1:10,1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
## [[ suppressing 10 column names 'MGH101-P1-A04', 'MGH101-P1-A05', 'MGH101-P1-A07' ... ]]
##
## A1BG . . . . . .
## A1BG-AS1 . . . . . .
## A1CF . . . . . 0.01292617
## A2M . 5.327184 . 8.698337 8.20509433 7.35516607
## A2M-AS1 . . . . . .
## A2ML1 0.06488285 . 0.07176267 0.077243 0.04684025 .
## A2MP1 . . . . . .
## A4GALT . . . . . .
## A4GNT . . . . . .
## AA06 . . . . . .
##
## A1BG . . . .
## A1BG-AS1 . 0.09626185 . .
## A1CF . . 0.0229004 .
## A2M 5.9685055 8.97614307 5.7860997 2.589524
## A2M-AS1 . . . .
## A2ML1 0.1890338 . 0.1686420 .
## A2MP1 . . . .
## A4GALT . . . .
## A4GNT . . . .
## AA06 . . . .
# Dimensions?
dim(smartseq)
## [1] 23686 7930
# how many samples are present?
samples <- str_split_fixed(colnames(smartseq), "-", 2)[,1]
length(table(samples))
## [1] 35
# How many cells?
length(samples)
## [1] 7930
# ~ how many cells/sample?
table(samples)
## samples
## BT1160 BT1187 BT749 BT771 BT786
## 335 219 317 339 212
## BT830 BT920 MGH100 MGH101 MGH102
## 220 301 209 200 312
## MGH104 MGH104negP2 MGH104negP4 MGH104negP7 MGH105A
## 139 65 74 77 160
## MGH105B MGH105C MGH105D MGH106 MGH106CD3posP1
## 122 167 185 135 65
## MGH110 MGH113 MGH115 MGH121 MGH122
## 359 260 165 297 273
## MGH124 MGH125 MGH128CD45neg MGH129CD45neg MGH136CD45neg
## 370 399 184 192 231
## MGH143CD45neg MGH151CD45neg MGH152CD45neg MGH66 MGH85
## 274 163 229 436 245
plot(sort(table(samples)), las=2, xlab="", main=glue('mean # cells/ sample {round(mean(table(samples)))}')); abline(h=mean(table(samples)), col="red", lty=2)
# The file from which this Seurat object was created "GSM3828672_Smartseq2_GBM_IDHwt_processed_TPM.tsv.gz" suggests the values should be TPM
head(smartseq[[]])
# But nCount_RNA (colSum of the expression counts) is not 1 million - for TPM it should be. If we look at this across all of the cells:
plot(unlist((smartseq[["nCount_RNA"]])), col=as.factor(samples))
# I'm not sure what these units are but they aren't TPM
# What do the Smart-seq2 cell names look like?
head(colnames(smartseq))
## [1] "MGH101-P1-A04" "MGH101-P1-A05" "MGH101-P1-A07" "MGH101-P1-A09"
## [5] "MGH101-P1-A10" "MGH101-P1-A11"
# "MGH101" is a patient sample, "P1" is probably the plate & "A04" is probably the well
# What do the gene names look like?
head(rownames(smartseq))
## [1] "A1BG" "A1BG-AS1" "A1CF" "A2M" "A2M-AS1" "A2ML1"
# How many genes?
length(rownames(smartseq))
## [1] 23686
# How many adult vs pediatric samples?
metadata %>% filter(str_detect(metadata$`processed data file`, "Smartseq2")) %>% group_by(`tumour name`, `adult/pediatric`) %>% count %>% select(-n) %>% arrange %>% table
## adult/pediatric
## tumour name adult pediatric
## BT1160 0 1
## BT1187 0 1
## BT749 0 1
## BT771 0 1
## BT786 0 1
## BT830 0 1
## BT920 0 1
## MGH100 1 0
## MGH101 1 0
## MGH102 1 0
## MGH104 1 0
## MGH105 1 0
## MGH106 1 0
## MGH110 1 0
## MGH113 1 0
## MGH115 1 0
## MGH121 1 0
## MGH122 1 0
## MGH124 1 0
## MGH125 1 0
## MGH128 1 0
## MGH129 1 0
## MGH136 1 0
## MGH143 1 0
## MGH151 1 0
## MGH152 1 0
## MGH66 1 0
## MGH85 0 1
Change the cell identity to the patient sample name e.g. “MGH101” and add a metadata adult_pediatric metadata column.
# Change cell identity to patient sample name
smartseq <- SetIdent(smartseq, value = samples)
# Add metadata column with info about whether sample is adult or pediatric
smartseq$adult_pediatric <- metadata$`adult/pediatric`[match(colnames(smartseq), metadata$`Sample name`)]
Visualise QC metrics, and use these to filter cells.
# When i search for mitochondrial genes ("^MT-") there were none, so can't use this QC metric
# Show QC metrics for the first 5 cells
head(x = smartseq@meta.data, 5)
#Visualize QC metrics as a violin plot
VlnPlot(object = smartseq, features = c("nFeature_RNA"), ncol = 2)
Exclude samples with <100 cells
# Remove samples with less than 100 cells
smartseq$keep <- !Idents(smartseq)%in%names(which(table(Idents(smartseq))<100))
smartseq <- subset(x = smartseq, subset = keep == TRUE)
# Restrict to samples that aren't "pos" or "neg"
smartseq$keep <- Idents(smartseq)%in%metadata$`tumour name`
smartseq <- subset(x = smartseq, subset = keep == TRUE)
dim(smartseq)
## [1] 23686 5742
# Integrate datasets if not already done
if(!file.exists("data/Neftel_2019/SmartSeq2_integrated.rds")){
# Create a list of Seurat objects to integrate
smartseq.list <- SplitObject(smartseq, split.by = "ident")
# Restrict to two samples for now
smartseq.list <- c(smartseq.list[[1]],smartseq.list[[2]])
# run SCTransform on each object separately
for (i in 1:length(smartseq.list)) {
smartseq.list[[i]] <- SCTransform(smartseq.list[[i]], verbose = TRUE, return.only.var.genes=FALSE)
}
# select features for downstream integration
smartseq.features <- SelectIntegrationFeatures(object.list = smartseq.list, nfeatures = 3000)
# run PrepSCTIntegration, which ensures that all necessary Pearson residuals have been calculated
smartseq.list <- PrepSCTIntegration(object.list = smartseq.list, anchor.features = smartseq.features, verbose = TRUE)
# identify anchors - will give this error "Cannot find more nearest neighbours than there are points" because some samples have a small number of cells, so we set k.filter argument to NA
smartseq.anchors <- FindIntegrationAnchors(object.list = smartseq.list, normalization.method = "SCT", anchor.features = smartseq.features, verbose = TRUE, k.filter = NA)
# integrate the datasets (integrate ALL genes)
all_genes <- lapply(smartseq.list, row.names) %>% Reduce(intersect, .) # get gene names present in ALL SCTransform'd datasets
smartseq.integrated <- IntegrateData(anchorset = smartseq.anchors, normalization.method = "SCT", verbose = TRUE, features.to.integrate = all_genes)
### Throwing an error when we try use all_genes, waiting on Seurat author's response to GitHub issue
# proceed with downstream analysis on the integrated dataset
smartseq.integrated <- RunPCA(smartseq.integrated, verbose = TRUE)
smartseq.integrated <- RunUMAP(smartseq.integrated, dims = 1:30)
# Find clusters
smartseq.integrated <- FindNeighbors(smartseq.integrated, dims = 1:30, verbose = TRUE)
smartseq.integrated <- FindClusters(smartseq.integrated, verbose = TRUE)
saveRDS(smartseq.integrated, "data/Neftel_2019/SmartSeq2_integrated.rds")
} else {
smartseq.integrated <- readRDS("data/Neftel_2019/SmartSeq2_integrated.rds")
}
# Add sample name as a metadata column
smartseq.integrated$sample_id <- str_split_fixed(colnames(smartseq.integrated), "-", 2)[,1]
# Plot UMAP
DimPlot(smartseq.integrated, label = TRUE)
DimPlot(smartseq.integrated, group.by = c("sample_id"))
DimPlot(smartseq.integrated, group.by = c("adult_pediatric"))
DimPlot(smartseq.integrated, split.by = c("orig.ident"), ncol = 2)
# Distinguish tumour from immune cell population
DefaultAssay(object = smartseq.integrated) <- "SCT"
FeaturePlot(object = smartseq.integrated, features = c("EGFR","PTPRC"), reduction= "umap")
DefaultAssay(object = smartseq.integrated) <- "integrated"
# The cluster on the left appears to be the neoplastic cells, the cluster on the right appears to be the immune cells
# Add module scores and save out - if not already done
if(!file.exists("data/Neftel_2019/smartseq.integrated_ModScore.rds")){
# The AddModuleScore function was working using the integrated data which contains 3000 rows, so most of the genes from the modules were missing, so I changed the active assay to SCT
DefaultAssay(object = smartseq.integrated) <- "SCT"
smartseq.integrated <- AddModuleScore(object = smartseq.integrated, features = Neftel_2019_states, name = colnames(Neftel_2019_states))
smartseq.integrated <- AddModuleScore(object = smartseq.integrated, features = classical, name = colnames(classical))
smartseq.integrated <- AddModuleScore(object = smartseq.integrated, features = proneural, name = colnames(proneural))
smartseq.integrated <- AddModuleScore(object = smartseq.integrated, features = mesenchymal, name = colnames(mesenchymal))
smartseq.integrated <- CellCycleScoring(smartseq.integrated, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
saveRDS(smartseq.integrated, "data/Neftel_2019/smartseq.integrated_ModScore.rds")
} else {
smartseq.integrated <- readRDS("data/Neftel_2019/smartseq.integrated_ModScore.rds")
}
## Plot module scores
# Cluster 4 seems to capture the cycling neoplastic cells
FeaturePlot(object = smartseq.integrated, features = c("S.Score", "G2M.Score"), reduction= "umap", label = TRUE)
# Wang et al subtypes
FeaturePlot(object = smartseq.integrated, features = c("classical1"), reduction= "umap", cols = brewer.pal(9, "Blues"), label = TRUE)
FeaturePlot(object = smartseq.integrated, features = c("proneural1"), reduction= "umap", cols = brewer.pal(9, "Greens"), label = TRUE)
FeaturePlot(object = smartseq.integrated, features = c("mesenchymal1"), reduction= "umap", cols = brewer.pal(9, "Reds"), label = TRUE)
# Neftel et al subtypes
FeaturePlot(object = smartseq.integrated, features = c("OPC4"), reduction= "umap", cols = brewer.pal(9, "YlGn"))
FeaturePlot(object = smartseq.integrated, features = c("AC3"), reduction= "umap", cols = brewer.pal(9, "YlOrBr"))
FeaturePlot(object = smartseq.integrated, features = c("MES12"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = smartseq.integrated, features = c("MES21"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = smartseq.integrated, features = c("NPC26"), reduction= "umap", cols = brewer.pal(9, "PuBu"))
FeaturePlot(object = smartseq.integrated, features = c("NPC15"), reduction= "umap", cols = brewer.pal(9, "PuBu"))
It might be an idea to inspect the modules scores themselves and how they vary across samples
par(mar=c(0,4,0,0))
par(mfrow=c(2,1))
# Plot MES1 scores, coloured by sample
plot(smartseq.integrated$MES12, col=as.factor(str_split_fixed(colnames(smartseq.integrated), "-",2)[,1]), pch=16, xaxt="n", xlab="n", ylab="MES1 module scores")
# Plot MES1 scores, coloured by plate
plot(smartseq.integrated$MES12, col=as.factor(str_split_fixed(colnames(smartseq.integrated), "-",3)[,2]), pch=16, xaxt="n", xlab="n", ylab="MES1 module scores")
par(mfrow=c(1,1))
# Subset just the sample MGH66
MGH66 <- subset(x = smartseq, idents = "MGH66")
# run sctransform
MGH66 <- suppressWarnings(SCTransform(MGH66, verbose = FALSE, return.only.var.genes = FALSE))
# Run PCA
MGH66 <- RunPCA(MGH66, verbose = FALSE)
# Run UMAP
MGH66 <- RunUMAP(MGH66, dims = 1:30, verbose = FALSE)
# Find clusters
MGH66 <- FindNeighbors(MGH66, dims = 1:30, verbose = FALSE)
MGH66 <- FindClusters(MGH66, verbose = FALSE)
# Plot
DimPlot(MGH66, label = TRUE) + NoLegend()
# Detect modules
MGH66 <- AddModuleScore(object = MGH66, features = Neftel_2019_states, name = colnames(Neftel_2019_states))
## Warning: The following features are not present in the object: IGFBP3, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: NA, not searching
## for symbol synonyms
## Warning: The following features are not present in the object: CA10, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: STMN2, DLX5,
## CRABP1, not searching for symbol synonyms
## Warning: The following features are not present in the object: NA, not searching
## for symbol synonyms
## Warning: The following features are not present in the object: NA, not searching
## for symbol synonyms
MGH66 <- CellCycleScoring(MGH66, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
## Plot module scores
FeaturePlot(object = MGH66, features = c("S.Score", "G2M.Score"), reduction= "umap", label = TRUE)
# Neftel et al subtypes
FeaturePlot(object = MGH66, features = c("AC3"), reduction= "umap", cols = brewer.pal(9, "YlOrBr"))
FeaturePlot(object = MGH66, features = c("MES12"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = MGH66, features = c("MES21"), reduction= "umap", cols = brewer.pal(9, "YlOrRd"))
FeaturePlot(object = MGH66, features = c("OPC4"), reduction= "umap", cols = brewer.pal(9, "YlGn"))
FeaturePlot(object = MGH66, features = c("NPC26"), reduction= "umap", cols = brewer.pal(9, "PuBu"))
FeaturePlot(object = MGH66, features = c("NPC15"), reduction= "umap", cols = brewer.pal(9, "PuBu"))